#!/bin/bash

# Exactly three arguments (<n> <begin> <end>) are required. Anything else is
# a usage error: print the help text to stderr and exit with status 1.
if [ "$#" -ne 3 ]; then
    # Quoted delimiter: the help text is literal, nothing in it is expanded.
    cat >&2 <<'EOF'
collect: Completion Project corpus collector
Downloads and filters a public corpus from Google
---
Usage: ./collect <n> <begin> <end>
n -- integer, for which n-grams to download
begin, end -- integers, indices of parts of Google corpus
---
Results will appear in "<n>grams" folder
EOF
    exit 1
fi

# Positional arguments: the n-gram order to fetch, and the first and last
# corpus part indices to process.
n=$1
begin=$2
end=$3

# All three values end up in file names and a download URL, and the usage
# text documents them as integers; reject anything else before doing work.
for arg in "$n" "$begin" "$end"; do
    case $arg in
        '' | *[!0-9]*)
            echo "collect: arguments must be non-negative integers, got '$arg'" >&2
            exit 1
            ;;
    esac
done

# Output directory for the filtered parts. -p makes re-runs (e.g. for a
# different range of parts) succeed when the directory already exists.
mkdir -p -- "${n}grams" || exit 1

# Build the filter program once, up front: it does not depend on the part
# being processed, and without it no part can be filtered at all.
# -D_FILE_OFFSET_BITS=64 requests 64-bit file offsets (large-file support).
if ! gcc -D_FILE_OFFSET_BITS=64 -O2 pystsch.c -o pystsch.out; then
    echo "collect: could not compile pystsch.c" >&2
    exit 1
fi

#######################################
# Download, unpack and filter a single part of the corpus.
# Globals:   n (read)
# Arguments: $1 - index of the corpus part
# Outputs:   progress output of wget/unzip and the timing of the filter run;
#            on success the filter's "out" file is moved to "<n>grams/<index>"
# Returns:   0 on success, otherwise the status of the first failing step
#######################################
collect_part() {
    local part=$1
    local name="googlebooks-eng-all-${n}gram-20090715-${part}.csv"
    local status=0

    # Each step runs only if the previous one succeeded. The archive is
    # removed before filtering so it does not sit next to the unpacked CSV.
    wget "http://commondatastorage.googleapis.com/books/ngrams/books/${name}.zip" \
        && unzip "${name}.zip" \
        && rm -f -- "${name}.zip" \
        && time ./pystsch.out "$name" \
        && mv -- out "${n}grams/${part}" \
        || status=$?

    # Remove intermediates whether or not the part succeeded, so a partial
    # download or unpacked CSV never lingers for the next part or a re-run.
    rm -f -- "${name}.zip" "$name"
    return "$status"
}

failed=0
# Word splitting of the seq output is intentional: one index per iteration.
for i in $(seq "$begin" "$end"); do
    echo "PART $i"
    if ! collect_part "$i"; then
        # Best effort: report the failure and carry on with the next part.
        echo "collect: part $i failed, skipping" >&2
        failed=$((failed + 1))
    fi
done

# Surface partial failure to the caller instead of always exiting 0.
if [ "$failed" -ne 0 ]; then
    echo "collect: $failed part(s) could not be processed" >&2
    exit 1
fi

